If you are using the package for the first time, you will have to first install it.
# install.packages("survival")
library(survival)
pbc <- survival::pbc
Code 1:
x<-10
y<-10/2
z<-x+y
Code 2:
# Store a value
x <- 10
# Take half of the stored value
y <- x/2
# Get the sum of both values
z <- x + y
Which code (1 or 2) would you prefer?
Create a matrix:
A <- matrix(data = rnorm(n = 1e06), nrow = 1000, ncol = 1000)
Calculate the sum per row:
system.time( apply(A, 1, sum) )
## user system elapsed
## 0.03 0.00 0.03
system.time( rowSums(A) )
## user system elapsed
## 0 0 0
Calculate the sum per row, repeat this procedure 100 times:
res1 <- replicate(n = 100, expr = apply(A, 1, sum))
res2 <- replicate(n = 100, expr = rowSums(A)) # use specialized functions
Compute the cumulative sum of a numeric vector. (cumulative sum of 1, 2, 3, 4 is 1, 3, 6, 10) First we create a numerical vector:
x <- rnorm(n = 1e06, mean = 10, sd = 10)
# The cumulative sum in the beginning is 0:
cSum <- 0
# Create an empty vector:
z <- numeric()
for (k in 1:length(x)){
cSum <- cSum + x[k]
z[k] <- cSum
}
Explore how it works:
k <- 1
cSum <- 0
cSum <- cSum + x[k]
k <- 2
cSum <- cSum + x[k]
Use specialized functions:
cSum <- cumsum(x)
Explore the timing of the code.
cSum <- 0
z <- numeric()
system.time({
cSum <- 0
for (i in 1:length(x)){
cSum <- cSum + x[i]
z[i] <- cSum
}
})
## user system elapsed
## 0.23 0.02 0.25
# better
system.time( cumsum(x) )
## user system elapsed
## 0 0 0
More than one ways exist to do something in R!
Calculate the mean weight for males
and females
in 100 datasets.
Since we do not have 100 data sets, we will create them!
Let’s do that first manually…
datlist <- list()
i <- 1
set.seed(123+i)
patient <- c(1:20)
weight <- rnorm(20, 70, 10)
sex <- sample(1:2, 20, replace = TRUE)
sex <- factor(sex, levels = 1:2, labels = c("male", "female"))
datlist[[i]] <- data.frame(patient, weight, sex)
i <- 2
set.seed(123+i)
patient <- c(1:20)
weight <- rnorm(20, 70, 10)
sex <- sample(1:2, 20, replace = T)
sex <- factor(sex, levels = 1:2, labels = c("male", "female"))
datlist[[i]] <- data.frame(patient, weight, sex)
# .....
It is too much work! Now use a loop:
for (i in 1:100) {
set.seed(123+i)
patient <- c(1:20)
weight <- rnorm(20, 70, 10)
sex <- sample(1:2, 20, replace = T)
sex <- factor(sex, levels = 1:2, labels = c("male", "female"))
datlist[[i]] <- data.frame(patient, weight, sex)
}
Now that we have 100 data sets we need to calculate the mean weight
per gender
in each of them.
Let’s do that manually…
means <- matrix(data = NA, nrow = length(datlist), ncol = 2)
i <- 1
dat <- datlist[[i]]
means[i, ] <- tapply(dat$weight, dat$sex, mean)
i <- 2
dat <- datlist[[i]]
means[i, ] <- tapply(dat$weight, dat$sex, mean)
# .....
Now use a loop:
for (i in 1:length(datlist)) {
dat <- datlist[[i]]
means[i, ] <- tapply(dat$weight, dat$sex, mean)
}
means
## [,1] [,2]
## [1,] 72.67093 65.35979
## [2,] 73.58761 70.54592
## [3,] 67.70248 71.18005
## [4,] 68.26480 71.41099
## [5,] 70.32488 73.52906
## [6,] 75.88853 65.13093
## [7,] 70.95698 68.43297
## [8,] 69.47857 68.85621
## [9,] 67.04953 75.86609
## [10,] 67.70928 74.10217
## [11,] 69.48756 71.42291
## [12,] 67.88684 72.51510
## [13,] 67.49031 70.37902
## [14,] 74.62734 72.50830
## [15,] 70.35835 65.61337
## [16,] 69.43763 70.10850
## [17,] 76.18847 76.72335
## [18,] 69.22957 70.67279
## [19,] 70.32851 75.38167
## [20,] 74.26989 67.86045
## [21,] 66.16984 64.05326
## [22,] 73.74897 76.30083
## [23,] 67.17520 69.11970
## [24,] 70.61564 68.93750
## [25,] 74.76173 70.34485
## [26,] 66.01594 63.84540
## [27,] 66.13668 69.79401
## [28,] 69.51881 69.14171
## [29,] 67.92588 70.58051
## [30,] 65.45927 70.48295
## [31,] 69.70336 72.30258
## [32,] 69.22542 71.23817
## [33,] 73.09973 66.59665
## [34,] 72.31334 78.05853
## [35,] 63.94101 67.19878
## [36,] 64.36021 67.98514
## [37,] 72.59174 69.79940
## [38,] 72.38843 75.25830
## [39,] 65.96138 69.01231
## [40,] 67.26918 66.50526
## [41,] 72.93189 69.08579
## [42,] 70.82892 69.04613
## [43,] 73.95355 70.65879
## [44,] 66.95142 66.91990
## [45,] 71.88453 71.12322
## [46,] 71.24994 72.46953
## [47,] 75.84781 72.89258
## [48,] 72.13181 69.25334
## [49,] 72.83673 70.41300
## [50,] 73.42010 77.15810
## [51,] 71.30431 75.75554
## [52,] 75.05188 72.56145
## [53,] 71.41672 71.90575
## [54,] 73.38978 72.47846
## [55,] 73.58065 75.76330
## [56,] 69.32783 76.11282
## [57,] 69.10112 73.41443
## [58,] 68.64874 69.88914
## [59,] 71.54627 73.45020
## [60,] 68.29513 72.58044
## [61,] 69.86955 70.82768
## [62,] 81.05653 67.28846
## [63,] 66.04835 71.89415
## [64,] 72.67207 66.32415
## [65,] 74.50970 64.55474
## [66,] 69.63641 72.14194
## [67,] 71.15808 73.00673
## [68,] 71.27653 75.53417
## [69,] 72.33656 69.00526
## [70,] 72.51226 69.50037
## [71,] 68.70055 69.83255
## [72,] 65.10849 72.32741
## [73,] 68.65566 71.51932
## [74,] 74.09553 74.52641
## [75,] 70.16617 70.32832
## [76,] 74.44624 66.83484
## [77,] 70.87837 68.30180
## [78,] 65.65148 70.65974
## [79,] 71.55960 67.12177
## [80,] 71.62386 69.18857
## [81,] 63.99797 68.16817
## [82,] 72.96528 72.09290
## [83,] 68.39122 71.72255
## [84,] 66.43081 67.23526
## [85,] 72.02173 69.11552
## [86,] 76.02071 66.53986
## [87,] 72.99379 71.37415
## [88,] 69.58583 68.32469
## [89,] 72.60383 65.49518
## [90,] 72.97765 63.53318
## [91,] 64.82462 70.53395
## [92,] 74.47472 69.85521
## [93,] 73.78986 78.60479
## [94,] 68.86772 68.62155
## [95,] 69.59406 69.86890
## [96,] 70.38825 67.42225
## [97,] 64.07105 71.27923
## [98,] 64.77403 70.14883
## [99,] 71.48067 66.56438
## [100,] 73.85559 65.00337
Select the data sets, where more than 39% of the patients are females
:
# We will store the data sets in a list called newList:
newList <- list()
# Using a for loop and an if statement we can go through the data sets and
# check the percentages of females:
for (i in 1:length(datlist)) {
dat <- datlist[[i]]
if (sum(dat$sex == "female")/20 > 0.39) {
newList[[i]] <- dat
}
}
Now we need to remove the elements of the list that are NULL:
newList <- newList[!sapply(newList, is.null)]
length(newList)
## [1] 91
Create a function that takes as input the data sets in a list format, the name of the sex
variable and the name of the female
category.
This function returns only the data sets, where more than 39% of the patients are females
.
The output should be a list.
subset_data <- function(dataset = x, sex_var = "sex", female_cat = "female"){
newList <- list()
for (i in 1:length(dataset)) {
dat <- dataset[[i]]
if (sum(dat[sex_var] == female_cat)/20 > 0.39) {
newList[[i]] <- dat
}
}
newList <- newList[!sapply(newList, is.null)]
}
res <- subset_data(dataset = datlist, sex_var = "sex", female_cat = "female")
length(res)
## [1] 91
Make a function that takes as input a data set, the name of a continuous variable and the name of a categorical variable. This function calculates the mean and standard deviation of the continuous variable for each group in the categorical variable.
des <- function(data = x, cont = "age", cat = "group"){
c(tapply(data[[cont]], data[[cat]], mean),
tapply(data[[cont]], data[[cat]], sd))
}
Apply the function to the pbc
data set. Use age
and sex
as continuous and categorical variables.
des(data = pbc, cont = "age", cat = "sex")
## m f m f
## 55.71072 50.15694 10.97780 10.24065
Why do we use [[]]
and not []
?
data = pbc
cont = "age"
cat = "sex"
data[[cont]] # this is in a vector format
## [1] 58.76523 56.44627 70.07255 54.74059 38.10541 66.25873 55.53457 53.05681
## [9] 42.50787 70.55989 53.71389 59.13758 45.68925 56.22177 64.64613 40.44353
## [17] 52.18344 53.93018 49.56057 59.95346 64.18891 56.27652 55.96715 44.52019
## [25] 45.07324 52.02464 54.43943 44.94730 63.87680 41.38535 41.55236 53.99589
## [33] 51.28268 52.06023 48.61875 56.41068 61.72758 36.62697 55.39220 46.66940
## [41] 33.63450 33.69473 48.87064 37.58248 41.79329 45.79877 47.42779 49.13621
## [49] 61.15264 53.50856 52.08761 50.54073 67.40862 39.19781 65.76318 33.61807
## [57] 53.57153 44.56947 40.39425 58.38193 43.89870 60.70637 46.62834 62.90760
## [65] 40.20260 46.45311 51.28816 32.61328 49.33881 56.39973 48.84600 32.49281
## [73] 38.49418 51.92060 43.51814 51.94251 49.82615 47.94524 46.51608 67.41136
## [81] 63.26352 67.31006 56.01369 55.83025 47.21697 52.75838 37.27858 41.39357
## [89] 52.44353 33.47570 45.60712 76.70910 36.53388 53.91650 46.39014 48.84600
## [97] 71.89322 28.88433 48.46817 51.46886 44.95003 56.56947 48.96372 43.01711
## [105] 34.03970 68.50924 62.52156 50.35729 44.06297 38.91034 41.15264 55.45791
## [113] 51.23340 52.82683 42.63929 61.07050 49.65640 48.85421 54.25599 35.15127
## [121] 67.90691 55.43600 45.82067 52.88980 47.18138 53.59890 44.10404 41.94935
## [129] 63.61396 44.22724 62.00137 40.55305 62.64476 42.33539 42.96783 55.96167
## [137] 62.86105 51.24983 46.76249 54.07529 47.03628 55.72621 46.10267 52.28747
## [145] 51.20055 33.86448 75.01164 30.86379 61.80424 34.98700 55.04175 69.94114
## [153] 49.60438 69.37714 43.55647 59.40862 48.75838 36.49281 45.76044 57.37166
## [161] 42.74333 58.81725 53.49760 43.41410 53.30595 41.35524 60.95825 47.75359
## [169] 35.49076 48.66256 52.66804 49.86995 30.27515 55.56742 52.15332 41.60986
## [177] 55.45243 70.00411 43.94251 42.56810 44.56947 56.94456 40.26010 37.60712
## [185] 48.36140 70.83641 35.79192 62.62286 50.64750 54.52704 52.69268 52.72005
## [193] 56.77207 44.39699 29.55510 57.04038 44.62697 35.79740 40.71732 32.23272
## [201] 41.09240 61.63997 37.05681 62.57906 48.97741 61.99042 72.77207 61.29500
## [209] 52.62423 49.76318 52.91444 47.26352 50.20397 69.34702 41.16906 59.16496
## [217] 36.07940 34.59548 42.71321 63.63039 56.62971 46.26420 61.24298 38.62012
## [225] 38.77070 56.69541 58.95140 36.92266 62.41478 34.60917 58.33539 50.18207
## [233] 42.68583 34.37919 33.18275 38.38193 59.76181 66.41205 46.78987 56.07940
## [241] 41.37440 64.57221 67.48802 44.82957 45.77139 32.95003 41.22108 55.41684
## [249] 47.98084 40.79124 56.97467 68.46270 78.43943 39.85763 35.31006 31.44422
## [257] 58.26420 51.48802 59.96988 74.52430 52.36413 42.78713 34.87474 44.13963
## [265] 46.38193 56.30938 70.90760 55.39493 45.08419 26.27789 50.47228 38.39836
## [273] 47.41958 47.98084 38.31622 50.10815 35.08830 32.50376 56.15332 46.15469
## [281] 65.88364 33.94387 62.86105 48.56400 46.34908 38.85284 58.64750 48.93634
## [289] 67.57290 65.98494 40.90075 50.24504 57.19644 60.53662 35.35113 31.38125
## [297] 55.98631 52.72553 38.09172 58.17112 45.21013 37.79877 60.65982 35.53457
## [305] 43.06639 56.39151 30.57358 61.18275 58.29979 62.33265 37.99863 33.15264
## [313] 60.00000 64.99932 54.00137 75.00068 62.00137 43.00068 46.00137 44.00000
## [321] 60.99932 64.00000 40.00000 63.00068 34.00137 52.00000 48.99932 54.00137
## [329] 63.00068 54.00137 46.00137 52.99932 56.00000 56.00000 55.00068 64.99932
## [337] 56.00000 47.00068 60.00000 52.99932 54.00137 50.00137 48.00000 36.00000
## [345] 48.00000 70.00137 51.00068 52.00000 54.00137 48.00000 66.00137 52.99932
## [353] 62.00137 59.00068 39.00068 67.00068 58.00137 64.00000 46.00137 64.00000
## [361] 40.99932 48.99932 44.00000 59.00068 63.00068 60.99932 64.00000 48.99932
## [369] 42.00137 50.00137 51.00068 36.99932 62.00137 51.00068 52.00000 44.00000
## [377] 32.99932 60.00000 63.00068 32.99932 40.99932 51.00068 36.99932 59.00068
## [385] 55.00068 54.00137 48.99932 40.00000 67.00068 68.00000 40.99932 68.99932
## [393] 52.00000 56.99932 36.00000 50.00137 64.00000 62.00137 42.00137 44.00000
## [401] 68.99932 52.00000 66.00137 40.00000 52.00000 46.00137 54.00137 51.00068
## [409] 43.00068 39.00068 51.00068 67.00068 35.00068 67.00068 39.00068 56.99932
## [417] 58.00137 52.99932
data[cont] # this is in a matrix format
# The tapply works with vector format data.
Now change the des
function so that the user would specify the function applied to the data set.
des <- function(data = x, cont = "age", cat = "group", fun = mean){
tapply(data[[cont]], data[[cat]], fun)
}
des(data = pbc, cont = "age", cat = "sex", fun = function(x) { sum(x) + 1 } )
## m f
## 2452.272 18759.697